1   package org.apache.solr.spelling;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one or more
5    * contributor license agreements.  See the NOTICE file distributed with
6    * this work for additional information regarding copyright ownership.
7    * The ASF licenses this file to You under the Apache License, Version 2.0
8    * (the "License"); you may not use this file except in compliance with
9    * the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
20  import java.io.IOException;
21  import java.util.ArrayList;
22  import java.util.Collections;
23  import java.util.Iterator;
24  import java.util.List;
25  import java.util.Locale;
26  import java.util.regex.Pattern;
27  
28  import org.apache.lucene.analysis.Token;
29  import org.apache.lucene.index.IndexReader;
30  import org.apache.lucene.index.Term;
31  import org.apache.lucene.search.spell.CombineSuggestion;
32  import org.apache.lucene.search.spell.SuggestWord;
33  import org.apache.lucene.search.spell.WordBreakSpellChecker;
34  import org.apache.lucene.search.spell.WordBreakSpellChecker.BreakSuggestionSortMethod;
35  import org.apache.solr.common.util.NamedList;
36  import org.apache.solr.core.SolrCore;
37  import org.apache.solr.search.SolrIndexSearcher;
38  
39  /**
40   * <p>
41   * A spellchecker that breaks and combines words.  
42   * </p>
43   * <p>
44   * This will not combine adjacent tokens that do not have 
45   * the same required status (prohibited, required, optional).  
46   * However, this feature depends on incoming term flags 
47   * being properly set. ({@link QueryConverter#PROHIBITED_TERM_FLAG},
48   * {@link QueryConverter#REQUIRED_TERM_FLAG}, 
49   * {@link QueryConverter#TERM_IN_BOOLEAN_QUERY_FLAG}, and
50   * {@link QueryConverter#TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG} )
51   * This feature breaks completely if the upstream analyzer or query
52   * converter sets flags with the same values but different meanings.
53   * The default query converter (if not using "spellcheck.q") 
54   * is {@link SpellingQueryConverter}, which properly sets these flags.
55   * </p>
56   */
57  public class WordBreakSolrSpellChecker extends SolrSpellChecker {
58    /**
59     * <p>
60     * Try to combine multiple words into one? [true|false]
61     * </p>
62     */
63    public static final String PARAM_COMBINE_WORDS = "combineWords";
64    /**
65     * <p>
66     * Try to break words into multiples? [true|false]
67     * </p>
68     */
69    public static final String PARAM_BREAK_WORDS = "breakWords";
70    /**
71     * See {@link WordBreakSpellChecker#setMaxChanges}
72     */
73    public static final String PARAM_MAX_CHANGES = "maxChanges";
74    /**
75     * See {@link WordBreakSpellChecker#setMaxCombineWordLength}
76     */
77    public static final String PARAM_MAX_COMBINE_WORD_LENGTH = "maxCombinedLength";
78    /**
79     * See {@link WordBreakSpellChecker#setMinBreakWordLength}
80     */
81    public static final String PARAM_MIN_BREAK_WORD_LENGTH = "minBreakLength";
82    /**
83     * See {@link BreakSuggestionTieBreaker} for options.
84     */
85    public static final String PARAM_BREAK_SUGGESTION_TIE_BREAKER = "breakSugestionTieBreaker";
86    /**
87     * See {@link WordBreakSpellChecker#setMaxEvaluations}
88     */
89    public static final String PARAM_MAX_EVALUATIONS = "maxEvaluations";
90    /**
91     * See {@link WordBreakSpellChecker#setMinSuggestionFrequency}
92     */
93    public static final String PARAM_MIN_SUGGESTION_FREQUENCY = "minSuggestionFreq";
94    
95    /**
96     * <p>
97     *  Specify a value on the "breakSugestionTieBreaker" parameter.
98     *    The default is MAX_FREQ.
99     * </p>  
100    */
101   public enum BreakSuggestionTieBreaker {
102     /**
103      * See
104      * {@link BreakSuggestionSortMethod#NUM_CHANGES_THEN_MAX_FREQUENCY}
105      * #
106      */
107     MAX_FREQ,
108     /**
109      * See
110      * {@link BreakSuggestionSortMethod#NUM_CHANGES_THEN_SUMMED_FREQUENCY}
111      */
112     SUM_FREQ
113   };
114   
115   private WordBreakSpellChecker wbsp = null;
116   private boolean combineWords = false;
117   private boolean breakWords = false;
118   private BreakSuggestionSortMethod sortMethod = BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY;
119   private static final Pattern spacePattern = Pattern.compile("\\s+");
120 
121   @Override
122   public String init(@SuppressWarnings("unchecked") NamedList config,
123       SolrCore core) {
124     String name = super.init(config, core);
125     combineWords = boolParam(config, PARAM_COMBINE_WORDS);
126     breakWords = boolParam(config, PARAM_BREAK_WORDS);
127     wbsp = new WordBreakSpellChecker();
128     String bstb = strParam(config, PARAM_BREAK_SUGGESTION_TIE_BREAKER);
129     if (bstb != null) {
130       bstb = bstb.toUpperCase(Locale.ROOT);
131       if (bstb.equals(BreakSuggestionTieBreaker.SUM_FREQ.name())) {
132         sortMethod = BreakSuggestionSortMethod.NUM_CHANGES_THEN_SUMMED_FREQUENCY;
133       } else if (bstb.equals(BreakSuggestionTieBreaker.MAX_FREQ.name())) {
134         sortMethod = BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY;
135       } else {
136         throw new IllegalArgumentException("Invalid value for parameter "
137             + PARAM_BREAK_SUGGESTION_TIE_BREAKER + " : " + bstb);
138       }
139     } else {
140       sortMethod = BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY;
141     }
142     int mc = intParam(config, PARAM_MAX_CHANGES);
143     if (mc > 0) {
144       wbsp.setMaxChanges(mc);
145     }
146     int mcl = intParam(config, PARAM_MAX_COMBINE_WORD_LENGTH);
147     if (mcl > 0) {
148       wbsp.setMaxCombineWordLength(mcl);
149     }
150     int mbwl = intParam(config, PARAM_MIN_BREAK_WORD_LENGTH);
151     if (mbwl > 0) {
152       wbsp.setMinBreakWordLength(mbwl);
153     }
154     int me = intParam(config, PARAM_MAX_EVALUATIONS);
155     if (me > 0) {
156       wbsp.setMaxEvaluations(me);
157     }
158     int msf = intParam(config, PARAM_MIN_SUGGESTION_FREQUENCY);
159     if (msf > 0) {
160       wbsp.setMinSuggestionFrequency(msf);
161     }
162     return name;
163   }
164   
165   private String strParam(@SuppressWarnings("unchecked") NamedList config,
166       String paramName) {
167     Object o = config.get(paramName);
168     return o == null ? null : o.toString();
169   }
170   
171   private boolean boolParam(@SuppressWarnings("unchecked") NamedList config,
172       String paramName) {
173     String s = strParam(config, paramName);
174     if ("true".equalsIgnoreCase(s) || "on".equalsIgnoreCase(s)) {
175       return true;
176     }
177     return false;
178   }
179   
180   private int intParam(@SuppressWarnings("unchecked") NamedList config,
181       String paramName) {
182     Object o = config.get(paramName);
183     if (o == null) {
184       return 0;
185     }
186     try {
187       return Integer.parseInt(o.toString());
188     } catch (NumberFormatException nfe) {
189       throw new IllegalArgumentException("Invalid integer for parameter "
190           + paramName + " : " + o);
191     }
192   }
193   
194   @Override
195   public SpellingResult getSuggestions(SpellingOptions options)
196       throws IOException {
197     IndexReader ir = options.reader;
198     int numSuggestions = options.count;
199     
200     StringBuilder sb = new StringBuilder();
201     Token[] tokenArr = options.tokens.toArray(new Token[options.tokens.size()]);
202     List<Term> termArr = new ArrayList<>(options.tokens.size() + 2);
203     
204     List<ResultEntry> breakSuggestionList = new ArrayList<>();
205     List<ResultEntry> noBreakSuggestionList = new ArrayList<>();
206     boolean lastOneProhibited = false;
207     boolean lastOneRequired = false;
208     boolean lastOneprocedesNewBooleanOp = false;
209     for (int i = 0; i < tokenArr.length; i++) {      
210       boolean prohibited = 
211         (tokenArr[i].getFlags() & QueryConverter.PROHIBITED_TERM_FLAG) == 
212           QueryConverter.PROHIBITED_TERM_FLAG;
213       boolean required = 
214         (tokenArr[i].getFlags() & QueryConverter.REQUIRED_TERM_FLAG) == 
215           QueryConverter.REQUIRED_TERM_FLAG;
216       boolean procedesNewBooleanOp = 
217         (tokenArr[i].getFlags() & QueryConverter.TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG) == 
218           QueryConverter.TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG;
219       if (i > 0
220           && (prohibited != lastOneProhibited || required != lastOneRequired || lastOneprocedesNewBooleanOp)) {
221         termArr.add(WordBreakSpellChecker.SEPARATOR_TERM);
222       }
223       lastOneProhibited = prohibited;
224       lastOneRequired = required;
225       lastOneprocedesNewBooleanOp = procedesNewBooleanOp;
226       
227       Term thisTerm = new Term(field, tokenArr[i].toString());
228       termArr.add(thisTerm);
229       if (breakWords) {
230         SuggestWord[][] breakSuggestions = wbsp.suggestWordBreaks(thisTerm,
231             numSuggestions, ir, options.suggestMode, sortMethod);
232         if(breakSuggestions.length==0) {
233           noBreakSuggestionList.add(new ResultEntry(tokenArr[i], null, 0));
234         }
235         for (SuggestWord[] breakSuggestion : breakSuggestions) {
236           sb.delete(0, sb.length());
237           boolean firstOne = true;
238           int freq = 0;
239           for (SuggestWord word : breakSuggestion) {
240             if (!firstOne) {
241               sb.append(" ");
242             }
243             firstOne = false;
244             sb.append(word.string);
245             if (sortMethod == BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY) {
246               freq = Math.max(freq, word.freq);
247             } else {
248               freq += word.freq;
249             }
250           }
251           breakSuggestionList.add(new ResultEntry(tokenArr[i], sb.toString(),
252               freq));
253         }
254       }
255     }
256     breakSuggestionList.addAll(noBreakSuggestionList);
257     
258     List<ResultEntry> combineSuggestionList = Collections.emptyList();
259     CombineSuggestion[] combineSuggestions = wbsp.suggestWordCombinations(
260         termArr.toArray(new Term[termArr.size()]), numSuggestions, ir, options.suggestMode);
261     if (combineWords) {
262       combineSuggestionList = new ArrayList<>(
263           combineSuggestions.length);
264       for (CombineSuggestion cs : combineSuggestions) {
265         int firstTermIndex = cs.originalTermIndexes[0];
266         int lastTermIndex = cs.originalTermIndexes[cs.originalTermIndexes.length - 1];
267         sb.delete(0, sb.length());
268         for (int i = firstTermIndex; i <= lastTermIndex; i++) {
269           if (i > firstTermIndex) {
270             sb.append(" ");
271           }
272           sb.append(tokenArr[i].toString());
273         }
274         Token token = new Token(sb.toString(), tokenArr[firstTermIndex]
275             .startOffset(), tokenArr[lastTermIndex].endOffset());
276         combineSuggestionList.add(new ResultEntry(token, cs.suggestion.string,
277             cs.suggestion.freq));
278       }
279     }
280     
281     // Interleave the two lists of suggestions into one SpellingResult
282     SpellingResult result = new SpellingResult();
283     Iterator<ResultEntry> breakIter = breakSuggestionList.iterator();
284     Iterator<ResultEntry> combineIter = combineSuggestionList.iterator();
285     ResultEntry lastBreak = breakIter.hasNext() ? breakIter.next() : null;
286     ResultEntry lastCombine = combineIter.hasNext() ? combineIter.next() : null;
287     int breakCount = 0;
288     int combineCount = 0;
289     while (lastBreak != null || lastCombine != null) {
290       if (lastBreak == null) {
291         addToResult(result, lastCombine.token, getCombineFrequency(ir, lastCombine.token), lastCombine.suggestion, lastCombine.freq);
292         lastCombine = null;
293       } else if (lastCombine == null) {
294         addToResult(result, lastBreak.token, ir.docFreq(new Term(field, lastBreak.token.toString())), lastBreak.suggestion, lastBreak.freq);
295         lastBreak = null;
296       } else if (lastBreak.freq < lastCombine.freq) {
297         addToResult(result, lastCombine.token, getCombineFrequency(ir, lastCombine.token), lastCombine.suggestion, lastCombine.freq);        
298         lastCombine = null;
299       } else if (lastCombine.freq < lastBreak.freq) {
300         addToResult(result, lastBreak.token, ir.docFreq(new Term(field, lastBreak.token.toString())), lastBreak.suggestion, lastBreak.freq);
301         lastBreak = null;
302       } else if (breakCount >= combineCount) { //TODO: Should reverse >= to < ??S
303         addToResult(result, lastCombine.token, getCombineFrequency(ir, lastCombine.token), lastCombine.suggestion, lastCombine.freq);        
304         lastCombine = null;
305       } else {
306         addToResult(result, lastBreak.token, ir.docFreq(new Term(field, lastBreak.token.toString())), lastBreak.suggestion, lastBreak.freq);        
307         lastBreak = null;
308       }
309       if (lastBreak == null && breakIter.hasNext()) {
310         lastBreak = breakIter.next();
311         breakCount++;
312       }
313       if (lastCombine == null && combineIter.hasNext()) {
314         lastCombine = combineIter.next();
315         combineCount++;
316       }
317     }
318     return result;
319   }
320   private void addToResult(SpellingResult result, Token token, int tokenFrequency, String suggestion, int suggestionFrequency) {
321     if(suggestion==null) {
322       result.add(token, Collections.<String>emptyList());
323       result.addFrequency(token, tokenFrequency);
324     } else {
325       result.add(token, suggestion, suggestionFrequency);
326       result.addFrequency(token, tokenFrequency);
327     }
328   }
329   
330   private int getCombineFrequency(IndexReader ir, Token token) throws IOException {
331     String[] words = spacePattern.split(token.toString());
332     int result = 0;
333     if(sortMethod==BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY) {      
334       for(String word : words) {
335         result = Math.max(result, ir.docFreq(new Term(field, word)));
336       }
337     } else {
338       for(String word : words) {
339         result += ir.docFreq(new Term(field, word));
340       }
341     }
342     return result;
343   }
344   
345   @Override
346   public void build(SolrCore core, SolrIndexSearcher searcher) {
347   /* no-op */
348   }
349   
350   @Override
351   public void reload(SolrCore core, SolrIndexSearcher searcher)
352       throws IOException {
353   /* no-op */
354   }
355   
356   @Override
357   public boolean isSuggestionsMayOverlap() {
358     return true;
359   }
360 }